##Libraries

library(tidyverse)
library(data.table)
library(matrixStats)
library(lubridate)
library(patchwork)
library(ggraph)
library(igraph)
library(ape)

Load data

# The raw sequence data is available as part of the COG-UK data set (PRJEB37886).
# However, the association between sequences and metadata has been retained to
# protect the privacy of patients. To request access please contact the authors.
# msa <- read.dna('./data/alignment_carehome_20200720.fasta', format = 'fasta')

meta <- fread("./data/carehome_clustering_input_metadata_20200720.csv") %>% as_tibble()
meta$collection_date_filled <- as.Date(meta$collection_date_filled, format = "%d/%m/%y")

write.csv(meta[, 1:2], file = "./processed_data/dates.csv", quote = FALSE, row.names = FALSE)

run the transcluster algorithm with liberal thresholds initially

fasttranscluster -o processed_data --save_probs --msa data/alignment_carehome_20200720.fasta --dates processed_data/dates.csv -K 20 --snp_threshold 15 -t 4

fasttranscluster -o processed_data_check_cog --save_probs --msa ./data/check_with_cog/carehome_public_genomes_align_update.fasta --dates processed_data/dates.csv -K 20 --snp_threshold 15 -t 4
import pyfastx

headers = []
for name, seq in pyfastx.Fasta('data/alignment_carehome_20200720.fasta', build_index=False):
    keep_seq = seq
    headers.append(name)

with open('./processed_data_identical/identical.fasta' ,'w') as outfile:
    for h in headers:
        outfile.write(">"+h+"\n"+keep_seq+"\n")
fasttranscluster -o processed_data_identical --save_probs --msa ./processed_data_identical/identical.fasta --dates processed_data/dates.csv -K 20 --snp_threshold 15 -t 4
trans <- fread("./processed_data/transcluster_probabilities.csv") %>% as_tibble()
trans$care_home_A <- meta$anonymised_care_home_code[match(trans$sampleA, meta$anonymised_sample_id)]
trans$care_home_B <- meta$anonymised_care_home_code[match(trans$sampleB, meta$anonymised_sample_id)]
trans$dateA <- meta$collection_date_filled[match(trans$sampleA, meta$anonymised_sample_id)]
trans$dateB <- meta$collection_date_filled[match(trans$sampleB, meta$anonymised_sample_id)]
trans$date_diff <- abs(trans$dateA - trans$dateB)
all_care_homes <- unique(meta$anonymised_care_home_code)

nintrodictions <- map_dfr(seq(0, 1, 0.005), function(prob) {
    nintros <- map_dbl(all_care_homes, ~{
        temp_trans <- trans %>% filter(care_home_A == .x) %>% filter(care_home_B == 
            .x)
        
        temp_samples <- meta$anonymised_sample_id[meta$anonymised_care_home_code == 
            .x]
        
        if (length(temp_samples) < 2) 
            return(1)
        
        d <- matrix(0, nrow = length(temp_samples), ncol = length(temp_samples), 
            dimnames = list(temp_samples, temp_samples))
        
        d[as.matrix(temp_trans[, c("sampleA", "sampleB")])] <- exp(apply(temp_trans[, 
            c("0", "1", "2")], 1, logSumExp))
        d[as.matrix(temp_trans[, c("sampleB", "sampleA")])] <- d[as.matrix(temp_trans[, 
            c("sampleA", "sampleB")])]
        diag(d) <- 1
        
        h <- hclust(as.dist(1 - d), method = "single")
        clust <- cutree(h, h = 1 - prob)
        
        return(length(unique(clust)))
    })
    tibble(cutoff = prob, nintros = list(nintros))
})

nintrodictions$max_intros <- map_dbl(nintrodictions$nintros, max)
nintrodictions$mean_intros <- map_dbl(nintrodictions$nintros, mean)
nintrodictions$total_intros <- map_dbl(nintrodictions$nintros, sum)

ggplot(nintrodictions, aes(x = cutoff, y = total_intros)) + geom_point() + theme_bw(base_size = 12) + 
    geom_vline(xintercept = 0.15, color = "red") + xlab("cutoff probability \n minimum probability cases are seperated by at most 2 intermediate hosts") + 
    ylab("total number of clusters across all care homes")

ggsave("./Figures/num_clusters_by_threshold.png", width = 12, height = 7)
ggsave("./Figures/num_clusters_by_threshold.pdf", width = 12, height = 7)
## minimum probability of a transmission with at most 2 intermediate hosts
MIN_PROB <- 0.15

trans_with_clusters <- map_dfr(all_care_homes, ~{
    temp_trans <- trans %>% filter(care_home_A == .x) %>% filter(care_home_B == .x)
    
    temp_samples <- meta$anonymised_sample_id[meta$anonymised_care_home_code == .x]
    
    if ((length(temp_samples) < 2) | (nrow(temp_trans) < 1)) 
        return(tibble())
    
    d <- matrix(0, nrow = length(temp_samples), ncol = length(temp_samples), dimnames = list(temp_samples, 
        temp_samples))
    
    d[as.matrix(temp_trans[, c("sampleA", "sampleB")])] <- exp(apply(temp_trans[, 
        c("0", "1", "2")], 1, logSumExp))
    d[as.matrix(temp_trans[, c("sampleB", "sampleA")])] <- d[as.matrix(temp_trans[, 
        c("sampleA", "sampleB")])]
    diag(d) <- 1
    
    h <- hclust(as.dist(1 - d), method = "single")
    clust <- cutree(h, h = 1 - MIN_PROB)
    
    temp_trans$clustA <- paste(.x, clust[temp_trans$sampleA], sep = "_")
    temp_trans$clustB <- paste(.x, clust[temp_trans$sampleB], sep = "_")
    
    return(temp_trans %>% filter(clustA == clustB))
})

plot histograms of resulting SNP and date differences between inferred links

ggplot(trans_with_clusters, aes(x = snp_distance)) + geom_bar(fill = "#4682B4") + 
    theme_bw(base_size = 14) + xlab("SNP distance")

ggsave("./Figures/snp_dist_histo.png", width = 9, height = 7)
ggsave("./Figures/snp_dist_histo.pdf", width = 9, height = 7)
ggplot(trans_with_clusters, aes(x = date_diff)) + geom_bar(fill = "#4682B4") + theme_bw(base_size = 14) + 
    xlab("date difference")

ggsave("./Figures/date_dist_histo.png", width = 9, height = 7)
ggsave("./Figures/date_dist_histo.pdf", width = 9, height = 7)
plotdf <- trans_with_clusters
plotdf$cluster <- factor(plotdf$clustA, levels = unique(names(sort(map_dbl(split(trans_with_clusters$date_diff, 
    trans_with_clusters$clustA), median), decreasing = TRUE))))

ggplot(plotdf, aes(x = cluster, y = date_diff, group = clustA)) + geom_boxplot(fill = "#4682B4", 
    outlier.colour = NA) + geom_point() + theme_bw(base_size = 14) + ylab("date difference") + 
    xlab("cluster")

ggsave("./Figures/date_dist_boxplot.png", width = 9, height = 7)
ggsave("./Figures/date_dist_boxplot.pdf", width = 9, height = 7)

plot number of samples versus number of clusters by care home

clusters <- map_dfr(all_care_homes, ~{
    temp_trans <- trans %>% filter(care_home_A == .x) %>% filter(care_home_B == .x)
    
    temp_samples <- meta$anonymised_sample_id[meta$anonymised_care_home_code == .x]
    
    if ((length(temp_samples) < 2) | (nrow(temp_trans) < 1)) 
        return(tibble(home = .x, sample = temp_samples, cluster = paste(.x, 1:length(temp_samples), 
            sep = "_")))
    
    d <- matrix(0, nrow = length(temp_samples), ncol = length(temp_samples), dimnames = list(temp_samples, 
        temp_samples))
    
    d[as.matrix(temp_trans[, c("sampleA", "sampleB")])] <- exp(apply(temp_trans[, 
        c("0", "1", "2")], 1, logSumExp))
    d[as.matrix(temp_trans[, c("sampleB", "sampleA")])] <- d[as.matrix(temp_trans[, 
        c("sampleA", "sampleB")])]
    diag(d) <- 1
    
    h <- hclust(as.dist(1 - d), method = "single")
    clust <- cutree(h, h = 1 - MIN_PROB)
    
    missing <- temp_samples[!temp_samples %in% names(clust)]
    
    return(tibble(home = .x, sample = c(names(clust), missing), cluster = c(paste(.x, 
        clust, sep = "_"), paste(rep(.x, length(missing)), max(clust) + seq_along(missing), 
        sep = "_"))))
})

# write out clusters
write.csv(clusters, file = "./processed_data/final_clusters.csv", quote = FALSE, 
    row.names = FALSE)

nintros_by_home <- clusters %>% group_by(home) %>% summarise(n_samples = n(), n_intros = length(unique(cluster))) %>% 
    arrange(n_samples)
nintros_by_home$home <- factor(nintros_by_home$home, levels = unique(nintros_by_home$home))

sum(nintros_by_home$n_intros)
## [1] 409
plotdf <- nintros_by_home %>% group_by(n_samples, n_intros) %>% summarise(count = n())

ggplot(plotdf, aes(x = n_samples, y = n_intros, size = count)) + geom_point() + theme_bw(base_size = 14) + 
    xlab("number of samples") + ylab("number of clusters") + scale_size_continuous(name = "care home count")

ggsave("./Figures/nsamples_vs_nclusters.png", width = 9, height = 7)
ggsave("./Figures/nsamples_vs_nclusters.pdf", width = 9, height = 7)

plot the more interesting networks with at least 7 samples. I am not sure how informative these are.

interesting_homes <- as.character(unique(nintros_by_home$home[nintros_by_home$n_samples >= 
    2]))

vertex_attributes <- clusters[, c(2, 1, 3)] %>% filter(home %in% interesting_homes)
edge_attributes <- trans_with_clusters %>% filter(sampleA %in% vertex_attributes$sample) %>% 
    filter(sampleB %in% vertex_attributes$sample)
p <- exp(apply(edge_attributes[, c("0", "1", "2")], 1, logSumExp))
edge_attributes <- edge_attributes[, c("sampleA", "sampleB", "snp_distance", "date_diff")]
edge_attributes$home <- meta$anonymised_care_home_code[match(edge_attributes$sampleA, 
    meta$anonymised_sample_id)]
edge_attributes$`transmission probability` <- p
edge_attributes <- edge_attributes %>% filter(`transmission probability` >= MIN_PROB)

graph <- igraph::graph_from_data_frame(edge_attributes, vertices = vertex_attributes)


ggraph(graph, layout = "kk") + geom_edge_fan(aes(colour = `transmission probability`)) + 
    geom_node_point(size = 3) + scale_color_brewer(type = "qual", palette = 3) + 
    theme_graph(foreground = "steelblue", fg_text_colour = "white", base_size = 14) + 
    facet_wrap(~home, scales = "free", ncol = 7) + scale_edge_colour_gradientn(colours = c("#313695", 
    "#ffffbf", "#a50026"))

ggsave("./Figures/interesting_transmission_networks.png", width = 15, height = 9)
ggsave("./Figures/all_transmission_networks.pdf", width = 12, height = 25, device = cairo_pdf)

##Healthcare worker analysis

Load data

# The raw sequence data is available as part of the COG-UK data set (PRJEB37886).
# However, the association between sequences and metadata has been retained to
# protect the privacy of patients. To request access please contact the authors.
# hcw_msa <- read.dna('./data/alignment_cuh_hcw_care_20200723.fasta', format =
# 'fasta')
hcw_meta <- fread("./data/cuh_hcw_care_cluster_analysis_set_20200723.csv") %>% as_tibble()
hcw_meta$collection_date_filled <- as.Date(hcw_meta$collection_date_filled, format = "%d/%m/%y")
write.csv(hcw_meta[, 1:2], file = "./processed_data/hcw_dates.csv", quote = FALSE, 
    row.names = FALSE)

run the transcluster algorithm with liberal thresholds initially

mkdir processed_data/hcw
fasttranscluster -o processed_data/hcw --save_probs --msa data/alignment_cuh_hcw_care_20200723.fasta --dates processed_data/hcw_dates.csv -K 20 --snp_threshold 15 -t 4
hcw_trans <- fread("./processed_data/hcw/transcluster_probabilities.csv") %>% as_tibble()
hcw_trans$care_home_A <- hcw_meta$anonymised_care_home_code[match(hcw_trans$sampleA, 
    hcw_meta$anonymised_sample_id)]
hcw_trans$care_home_B <- hcw_meta$anonymised_care_home_code[match(hcw_trans$sampleB, 
    hcw_meta$anonymised_sample_id)]
hcw_trans$dateA <- hcw_meta$collection_date_filled[match(hcw_trans$sampleA, hcw_meta$anonymised_sample_id)]
hcw_trans$dateB <- hcw_meta$collection_date_filled[match(hcw_trans$sampleB, hcw_meta$anonymised_sample_id)]
hcw_trans$date_diff <- abs(hcw_trans$dateA - hcw_trans$dateB)

inital clustering of all individuals

temp_samples <- hcw_meta$anonymised_sample_id
d <- matrix(0, nrow = length(temp_samples), ncol = length(temp_samples), dimnames = list(temp_samples, 
    temp_samples))

d[as.matrix(hcw_trans[, c("sampleA", "sampleB")])] <- exp(apply(hcw_trans[, c("0", 
    "1", "2")], 1, logSumExp))
d[as.matrix(hcw_trans[, c("sampleB", "sampleA")])] <- d[as.matrix(hcw_trans[, c("sampleA", 
    "sampleB")])]
diag(d) <- 1

h <- hclust(as.dist(1 - d), method = "single")
clust <- cutree(h, h = 1 - MIN_PROB)

missing <- temp_samples[!temp_samples %in% names(clust)]

all_clust <- tibble(sample = c(names(clust), missing), cluster = c(paste("CH_HCW", 
    clust, sep = "_"), paste(rep("CH_HCW", length(missing)), max(clust) + seq_along(missing), 
    sep = "_")))

all_clust$hcw_status <- hcw_meta$cuh_hcw_care_status[match(all_clust$sample, hcw_meta$anonymised_sample_id)]

write.csv(all_clust, file = "./processed_data/final_hcw_clustering_combined.csv", 
    quote = FALSE, row.names = FALSE)

plot all on one big graph

vertex_attributes <- hcw_meta

edge_attributes <- hcw_trans

p <- exp(apply(edge_attributes[, c("0", "1", "2")], 1, logSumExp))
edge_attributes <- edge_attributes[, c("sampleA", "sampleB", "snp_distance", "date_diff")]
edge_attributes$home <- hcw_meta$anonymised_care_home_code[match(edge_attributes$sampleA, 
    hcw_meta$anonymised_sample_id)]
edge_attributes$home[edge_attributes$home == ""] <- "HCW"

edge_attributes$`transmission probability` <- p
edge_attributes <- edge_attributes %>% filter(`transmission probability` >= MIN_PROB)
vertex_attributes$status <- ifelse(vertex_attributes$cuh_hcw_care_status == "hcw", 
    "health care worker", "care home resident")
hcws <- hcw_meta$anonymised_sample_id[hcw_meta$cuh_hcw_care_status == "hcw"]
edge_attributes$involves_hcw <- (edge_attributes$sampleA %in% hcws) | (edge_attributes$sampleB %in% 
    hcws)

graph <- igraph::graph_from_data_frame(edge_attributes, vertices = vertex_attributes)

ggraph(graph, layout = "kk") + geom_edge_fan(aes(colour = `transmission probability`, 
    alpha = involves_hcw)) + geom_node_point(aes(colour = status), size = 3) + scale_color_brewer(type = "qual", 
    palette = 3) + theme_graph(foreground = "steelblue", fg_text_colour = "white", 
    base_size = 14) + scale_color_manual(values = c("#377eb8", "#4daf4a")) + scale_edge_alpha_manual(values = c(1, 
    0.2), guide = FALSE) + scale_edge_colour_gradientn(colours = c("#313695", "#ffffbf", 
    "#a50026"))

ggsave("./Figures/hcw_transmission_networks_combined.png", width = 15, height = 9)

cluster into plausible transmission chains seperated by care home allowing HCW’s to appear in multiple care homes.

hcw_clusters <- map_dfr(all_care_homes, ~{
    temp_trans <- hcw_trans %>% filter((care_home_A == .x) | (care_home_A == "")) %>% 
        filter((care_home_B == .x) | (care_home_B == "")) %>% filter(!((care_home_A == 
        "") & (care_home_B == "")))
    
    if (nrow(temp_trans) < 1) 
        return(tibble())
    
    temp_samples <- hcw_meta$anonymised_sample_id[hcw_meta$anonymised_care_home_code %in% 
        c(.x, "")]
    
    if ((length(temp_samples) < 2) | (nrow(temp_trans) < 1)) 
        return(tibble(home = .x, sample = temp_samples, cluster = paste(.x, 1:length(temp_samples), 
            sep = "_")))
    
    d <- matrix(0, nrow = length(temp_samples), ncol = length(temp_samples), dimnames = list(temp_samples, 
        temp_samples))
    
    d[as.matrix(temp_trans[, c("sampleA", "sampleB")])] <- exp(apply(temp_trans[, 
        c("0", "1", "2")], 1, logSumExp))
    d[as.matrix(temp_trans[, c("sampleB", "sampleA")])] <- d[as.matrix(temp_trans[, 
        c("sampleA", "sampleB")])]
    diag(d) <- 1
    
    h <- hclust(as.dist(1 - d), method = "single")
    clust <- cutree(h, h = 1 - MIN_PROB)
    
    missing <- temp_samples[!temp_samples %in% names(clust)]
    
    temp_clust <- tibble(home = .x, sample = c(names(clust), missing), cluster = c(paste(.x, 
        clust, sep = "_"), paste(rep(.x, length(missing)), max(clust) + seq_along(missing), 
        sep = "_")))
    
    temp_clust$hcw_status <- hcw_meta$cuh_hcw_care_status[match(temp_clust$sample, 
        hcw_meta$anonymised_sample_id)]
    
    # only keep those clusters with both HCW and carehome residents
    keep <- temp_clust %>% group_by(cluster) %>% summarise(ncat = length(unique(hcw_status))) %>% 
        filter(ncat > 1)
    
    return(temp_clust %>% filter(cluster %in% keep$cluster))
})

# write out clusters
write.csv(hcw_clusters, file = "./processed_data/final_hcw_clusters.csv", quote = FALSE, 
    row.names = FALSE)

plot the networks that contain both HCW and carehome residents. HCW can be included in multiple clusters as we split by carehome.

interesting_homes <- unique(hcw_clusters$home)

vertex_attributes <- hcw_clusters[, c(2, 1, 3, 4)] %>% filter(home %in% interesting_homes)
# relabel healthcare worker to give them a unique ID per carehome
vertex_attributes$sample <- imap_chr(vertex_attributes$sample, ~{
    if (vertex_attributes$hcw_status[[.y]] == "hcw") {
        paste(.x, vertex_attributes$home[[.y]], sep = "_")
    } else {
        .x
    }
})

edge_attributes <- hcw_trans %>% filter(sampleA %in% hcw_clusters$sample) %>% filter(sampleB %in% 
    hcw_clusters$sample)
hcws <- hcw_meta$anonymised_sample_id[hcw_meta$cuh_hcw_care_status == "hcw"]
edge_attributes <- map_dfr(interesting_homes, ~{
    temp <- edge_attributes
    temp$sampleA[temp$sampleA %in% hcws] <- paste(temp$sampleA[temp$sampleA %in% 
        hcws], .x, sep = "_")
    temp$sampleB[temp$sampleB %in% hcws] <- paste(temp$sampleB[temp$sampleB %in% 
        hcws], .x, sep = "_")
    return(temp)
})

p <- exp(apply(edge_attributes[, c("0", "1", "2")], 1, logSumExp))
edge_attributes <- edge_attributes[, c("sampleA", "sampleB", "snp_distance", "date_diff")]
edge_attributes$home <- hcw_meta$anonymised_care_home_code[match(edge_attributes$sampleA, 
    hcw_meta$anonymised_sample_id)]
edge_attributes$home[is.na(edge_attributes$home)] <- gsub(".*_", "", edge_attributes$sampleA[is.na(edge_attributes$home)])

edge_attributes$`transmission probability` <- p
edge_attributes <- edge_attributes %>% filter(`transmission probability` >= MIN_PROB)

edge_attributes <- edge_attributes %>% filter((sampleA %in% vertex_attributes$sample) & 
    (sampleB %in% vertex_attributes$sample))

edge_attributes$homeA <- hcw_meta$anonymised_care_home_code[match(edge_attributes$sampleA, 
    hcw_meta$anonymised_sample_id)]
edge_attributes$homeA[is.na(edge_attributes$homeA)] <- gsub(".*_", "", edge_attributes$sampleA[is.na(edge_attributes$homeA)])
edge_attributes$homeB <- hcw_meta$anonymised_care_home_code[match(edge_attributes$sampleB, 
    hcw_meta$anonymised_sample_id)]
edge_attributes$homeB[is.na(edge_attributes$homeB)] <- gsub(".*_", "", edge_attributes$sampleB[is.na(edge_attributes$homeB)])

edge_attributes <- edge_attributes %>% filter(homeA == homeB)

link_list <- imap_dfr(hcw_meta$anonymised_sample_id[hcw_meta$cuh_hcw_care_status == 
    "carehome_resident"], ~{
    temp <- edge_attributes %>% filter(sampleA == .x)
    tempB <- edge_attributes[, c(2, 1, 3, 4, 5, 6, 8, 7)] %>% filter(sampleB == .x)
    colnames(tempB) <- colnames(temp)
    temp <- rbind(temp, tempB)
    if (nrow(temp) < 1) 
        return(tibble(resident = .x, care_home = hcw_meta$anonymised_care_home_code[hcw_meta$anonymised_sample_id == 
            .x], hcw_links = "", res_links = ""))
    return(tibble(resident = .x, care_home = unique(temp$home), hcw_links = paste(unique(paste(gsub("_.*", 
        "", temp$sampleB[grepl("_", temp$sampleB)]), format(temp$`transmission probability`[grepl("_", 
        temp$sampleB)], digits = 2), sep = "_")), collapse = ","), res_links = paste(unique(paste(gsub("_.*", 
        "", temp$sampleB[!grepl("_", temp$sampleB)]), format(temp$`transmission probability`[!grepl("_", 
        temp$sampleB)], digits = 2), sep = "_")), collapse = ",")))
})
write.table(link_list, file = "./processed_data/hcw_links__collapsed_by_resident.tsv", 
    sep = "\t", quote = FALSE, row.names = FALSE, col.names = TRUE)


edge_attributes <- edge_attributes[!duplicated(edge_attributes[, c(1, 2)]), ]
edge_attributes$involves_hcw <- grepl("_", edge_attributes$sampleA) | grepl("_", 
    edge_attributes$sampleB)
edge_attributes <- edge_attributes[!(grepl("_", edge_attributes$sampleA) & grepl("_", 
    edge_attributes$sampleB)), ]

edge_attributes <- edge_attributes %>% arrange(-`transmission probability`)

vertex_attributes <- vertex_attributes %>% filter((sample %in% unique(c(edge_attributes$sampleA, 
    edge_attributes$sampleB))) | (hcw_status == "carehome_resident"))


vertex_attributes$status <- ifelse(vertex_attributes$hcw_status == "hcw", "health care worker", 
    "care home resident")

graph <- igraph::graph_from_data_frame(edge_attributes, vertices = vertex_attributes)

ggraph(graph, layout = "kk") + geom_edge_fan(aes(colour = `transmission probability`, 
    alpha = involves_hcw)) + geom_node_point(aes(colour = status), size = 3) + scale_color_brewer(type = "qual", 
    palette = 3) + theme_graph(foreground = "steelblue", fg_text_colour = "white", 
    base_size = 16) + facet_wrap(~home, scales = "free", ncol = 3) + scale_color_manual(values = c("#377eb8", 
    "#4daf4a")) + scale_edge_alpha_manual(values = c(1, 0.2), guide = FALSE) + scale_edge_colour_gradientn(colours = c("#313695", 
    "#ffffbf", "#a50026")) + theme(strip.text.x = element_text(size = 14))

ggsave("./Figures/hcw_transmission_networks.png", width = 9, height = 12, device = cairo_pdf)
ggsave("./Figures/hcw_transmission_networks.pdf", width = 9, height = 12, device = cairo_pdf)


edge_attributes$sampleA <- map_chr(str_split(edge_attributes$sampleA, "_"), ~.x[[1]])
edge_attributes$sampleB <- map_chr(str_split(edge_attributes$sampleB, "_"), ~.x[[1]])
write.csv(edge_attributes[, c("sampleA", "sampleB", "snp_distance", "date_diff", 
    "home", "transmission probability", "involves_hcw")], file = "./processed_data/hcw_transmission_links.csv", 
    quote = FALSE, row.names = FALSE)